Load Packages

library(knitr)        # for kable tables
library(performance)  # for model checking
library(DHARMa)       # for model checking
library(effects)      # for nice summary of model results (allEffects)
library(broom.mixed)  # for tidy output
library(ggplotify)    # to use patchwork on lattice plots (created by allEffects)
library(MuMIn)        # to calculate R2
library(pROC)         # for ROC curves and AUC
library(patchwork)    # for combining ggplot plots
library(tidyverse)

Load Data

# Update this when have final results from workshop:
# PGS per individual as computed by PLINK --score (SCORESUM column)
PGS_df <- read.table(file = "/path/PGS_Workshop/01_Create_PGS/Plink_PGS_scores/ILAE3_Caucasian_all_epilepsy_EUR.QC_PGS_scores.profile", header = TRUE)

# Read in covariates data
cov_df <- read.table(file = "/path/PGS_Workshop/Genotype_data/EUR.cov", header = TRUE)

# Read in ancestry PCs data (no header in the .eigenvec file, so name the columns explicitly)
PCs_df <- read.table(file = "/path/PGS_Workshop/Genotype_data/EUR.eigenvec", header = FALSE, col.names = c("FID", "IID", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6"))

# Relatedness data (PLINK --genome output with pairwise PI_HAT)
related_df <- read.table(file = "/path/PGS_Workshop/02_Evaluate_PGS/EUR_relatedness.genome", header = TRUE)

Tidy Data

# Merge these three dataframes (PGS, covariates and PCs) and add simulated data for our phenotype = epilepsy (case/control)
# NOTE(review): the left_join() calls rely on natural joins over shared column
# names (presumably FID/IID) — consider making the keys explicit with `by =`.

set.seed(123) # set seed for reproducibility
Df <- PGS_df %>% 
  left_join(cov_df) %>% 
  left_join(PCs_df) %>% 
  mutate(
    # Linear predictor: intercept + effect of the standardised PGS plus
    # small simulated covariate effects (Sex, ancestry PCs)
    lp = -1.5 + 0.8 * scale(SCORESUM) + 
      0.05 * Sex + 0.01 * PC1 + 0.005 * PC2 +
      0.005 * PC3 + 0.00001 * PC4 + 0.0003 * PC5 + 0.00008 * PC6,
    # Inverse-logit maps the linear predictor to a probability
    prob = plogis(lp),
    # Draw simulated binary case/control status from that probability
    Epilepsy = rbinom(n(), size = 1, prob = prob)) %>% 
  select(-c(lp, prob))



# Remove 1 individual from each pair that have pi_hat > 0.1875 with controls preferentially removed.
# This removes individuals that are first- and second-degree relatives (0.1875 is halfway between 2nd and 3rd degree relatives)
# Note in our data that related_df is empty - no individuals have pi_hat > 0.09 (this dataset was QCed and related individuals were already removed)
# So this code is commented out, but you can use it if your data includes related individuals

# # Df with case/ctrl status for each IID
# Epilepsy_status <- Df %>% 
#   select(IID, Epilepsy)
# 
# # Join related df with Epilepsy_status so have case/control status for both IID1 and IID2
# 
# related_epilepsy_status <- related_df %>%
#   left_join(Epilepsy_status, by = c("IID1" = "IID")) %>%
#   rename(Epilepsy_IID1 = Epilepsy) %>%
#   left_join(Epilepsy_status, by = c("IID2" = "IID")) %>%
#   rename(Epilepsy_IID2 = Epilepsy)
# 
# # Filter pairs with PI_HAT > 0.1875, preferentially removing controls and keeping cases
# related_remove <- related_epilepsy_status %>%
#   filter(PI_HAT > 0.1875) %>%
#   rowwise() %>% 
#   mutate(
#     remove = case_when(
#       Epilepsy_IID1 == 0 & Epilepsy_IID2 == 1 ~ IID1,  # Prefer removing control (IID1)
#       Epilepsy_IID1 == 1 & Epilepsy_IID2 == 0 ~ IID2,  # Prefer removing control (IID2)
#       Epilepsy_IID1 == 0 & Epilepsy_IID2 == 0 ~ sample(c(IID1, IID2), 1),  # Randomly remove one if both are controls
#       Epilepsy_IID1 == 1 & Epilepsy_IID2 == 1 ~ sample(c(IID1, IID2), 1)   # Randomly remove one if both are cases
#     )
#   ) %>% 
#   select(remove)
# 
# # Remove these individuals from df
# Df <- Df %>% 
#   anti_join(related_remove, by = join_by("IID" == "remove"))

save(Df, file = "Data/Df.Rdata")

Logistic Regression

  • Determine direction of relationship between outcome (epilepsy case/control) and predictor (PGS for epilepsy)

Explore Data

  • Data exploration before fitting model
# Data exploration before fitting regression models

# Sanity-check the PGS distribution before and after standardisation:
# after scale() the score should have mean 0 and SD 1.
Df %>%
  mutate(PRS_std = scale(SCORESUM)) %>%
  summarise(
    mean_PRS = mean(SCORESUM, na.rm = TRUE),
    sd_PRS = sd(SCORESUM, na.rm = TRUE),
    mean_PRS_standardised = mean(PRS_std, na.rm = TRUE),
    sd_PRS_standardised = sd(PRS_std, na.rm = TRUE)) %>%
  kable()
mean_PRS sd_PRS mean_PRS_standardised sd_PRS_standardised
-0.0193872 0.0116532 0 1
# Plot distribution of standardised PGS split by epilepsy case/control

# Group means of the standardised PGS, used for the dashed reference lines
group_means <- Df %>%
  mutate(SCORESUM_scaled = scale(SCORESUM)) %>%
  group_by(Epilepsy) %>%
  summarise(mean_value = mean(SCORESUM_scaled, na.rm = TRUE))

status_labels <- c("1" = "Case", "0" = "Control")

ggplot(Df, aes(x = scale(SCORESUM), colour = as.factor(Epilepsy), fill = as.factor(Epilepsy))) +
  geom_density(alpha = 0.6, position = "identity") +
  # Dashed vertical line at each group's mean standardised PGS
  geom_vline(data = group_means, aes(xintercept = mean_value, colour = as.factor(Epilepsy)),
             linetype = "dashed", linewidth = 1, alpha = 1,
             show.legend = FALSE) +
  scale_colour_viridis_d(name = "Epilepsy", labels = status_labels) +
  scale_fill_viridis_d(name = "Epilepsy", labels = status_labels) +
  scale_x_continuous("Polygenic Score for epilepsy (standardised)") +
  scale_y_continuous("") +
  theme_classic() +
  theme(text = element_text(family = "Calibri"),
        axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 12, colour = "black", margin = margin(10, 0, 0, 0)),
        axis.title.y = element_text(size = 12, colour = "black", margin = margin(0, 10, 0, 0)),
        legend.title = element_blank(),
        legend.text = element_text(size = 10, colour = "black"),
        legend.position = "right")

Fit Models

Check Models

  • Check model assumptions are met

Covariates only model

# Diagnostic panel for the covariates-only logistic regression
# (logistic_regression_covar is fitted in an earlier, non-visible chunk)
performance::check_model(logistic_regression_covar)

# DHARMa simulation-based scaled residuals — appropriate for GLMs
simulateResiduals(logistic_regression_covar, plot=TRUE)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.5342695 0.3987193 0.7909382 0.9275532 0.9366849 0.5574282 0.5132182 0.3413922 0.7306458 0.4876829 0.9584471 0.6598171 0.3993855 0.294575 0.3646907 0.8758508 0.7393357 0.6822836 0.4953915 0.9085732 ...

Full model including PGS

# Diagnostic panel for the full model (covariates + PGS)
# (logistic_regression_PGS is fitted in an earlier, non-visible chunk)
performance::check_model(logistic_regression_PGS)

# DHARMa simulation-based scaled residuals — appropriate for GLMs
simulateResiduals(logistic_regression_PGS, plot=TRUE)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.4588432 0.4242531 0.9105435 0.879569 0.9599435 0.6487582 0.6422395 0.4602313 0.5411003 0.6160205 0.9722981 0.712462 0.3517316 0.3558069 0.4031858 0.8432616 0.8202006 0.8273669 0.4903365 0.8996097 ...

Results

  • Check odds ratio - is the relationship between epilepsy (case/control) and PGS for epilepsy in the expected direction (i.e. positive, so an OR > 1)?
    • If the relationship is in the opposite direction to what is expected, check that the columns for effect allele and other allele were correct (and not flipped) in the GWAS summary statistics used to make the PGS

Table

# Exponentiated coefficient for the PGS term = odds ratio per SD of PGS
tidy(logistic_regression_PGS, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE) %>%
  filter(str_detect(term, "SCORESUM")) %>%
  kable(caption = "Results from logistic regression (back-transformation conducted so estimate = odds ratio)")
Results from logistic regression (back-transformation conducted so estimate = odds ratio
term estimate std.error statistic p.value conf.low conf.high
scale(SCORESUM) 1.893025 0.1284467 4.968408 7e-07 1.48026 2.451792

Graph

# One-row odds-ratio table for the PGS term, with a significance label.
# (Trailing comma removed from the tidy() call — it created an empty argument.)
or_table <- tidy(logistic_regression_PGS, exponentiate = TRUE, conf.int = TRUE, conf.level = 0.95) %>%
  filter(str_detect(term, "SCORESUM")) %>%
  mutate(
    term = str_replace(term, "scale\\(SCORESUM\\)", "PGS"),
    sig_label = ifelse(p.value < 0.05, "*", "")
  )

# Odds ratio with 95% CI; dashed line marks OR = 1 (no effect)
ggplot(or_table, aes(x = term, y = estimate)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray") +
  geom_point() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  scale_y_continuous("Odds Ratio") +
  scale_x_discrete("Predictor") +
  # geom_text(aes(label = sig_label, y = conf.high + 0.05), size = 6) +
  theme_classic() +
  theme(
    text = element_text(family = "Calibri"),
    axis.text = element_text(size = 10),
    axis.title.x = element_text(size = 12, colour = "black", margin = margin(10, 0, 0, 0)),
    axis.title.y = element_text(size = 12, colour = "black", margin = margin(0, 10, 0, 0)),
    legend.title = element_blank(),
    legend.text = element_text(size = 10, colour = "black"),
    legend.position = "right"
  )

Nagelkerke’s R2

  • To quantify the variance attributable to the polygenic score (PGS), we compute: Nagelkerke’s R2 attributable to the PGS: Nagelkerke’s R2(full model) - Nagelkerke’s R2(covariates only model)
  • As we’re using a binary trait and a logistic regression we use a pseudo-R2 value, here Nagelkerke’s R2
  • Nagelkerke’s R2 is an adjusted version of Cox and Snell’s R2, scaled to a range of 0 to 1 for easier interpretation
  • However, as a pseudo R2, this value cannot be interpreted as the proportion of variance explained in the same way R2 is used in a linear regression.
  • Instead this pseudo R2 is a relative measure of model fit, representing an approximation of explained variance:
    • 0 means the model explains none of the variation in the outcome
    • 1 means the model explains all the variation in the outcome
  • You can use some general thresholds to interpret Nagelkerke’s R2 (these thresholds can change across fields). E.g. :
    • 0 - 0.1: weak explanatory power
    • 0.1 - 0.3, moderate explanatory power
    • 0.3 - 0.5, strong explanatory power
    • > 0.5 very strong explanatory power
  • Nagelkerke’s R2 is calculated on the observed binary scale
    • Depends on the case/control ratio in your sample
    • Cannot be compared across studies with different case/control ascertainment
    • Does not estimate heritability
  • Thus, Nagelkerke’s R2 is not a very interpretable measure of variance explained.
# Nagelkerke's R2 attributable to the PGS:
# pseudo-R2 of the full model (covariates + PGS) minus the covariates-only model
r2_PGS <- r2_nagelkerke(logistic_regression_PGS) - r2_nagelkerke(logistic_regression_covar)

r2_PGS %>% 
  kable(caption = "Nagelkerke's R2 attributable to the PGS")
Nagelkerke’s R2 attributable to the PGS
x
Nagelkerke’s R2 0.0836051

R2 on the Liability Scale

  • Variance in outcome variable risk explained on the liability scale, attributable to PGS
  • We run linear regressions, even though our outcome is binary, to calculate R2 for our covariates + PGS model and covariates only model, then do R2(covariates + PGS) - R2 (covariates only) to calculate R2 attributable to PGS
  • This R2 is on the observed scale
    • It is biased by case/control ratio in your sample
  • So we adjust R2 to the liability scale, which reflects a latent (normally distributed) trait underlying disease risk.
  • For more details, see publication: Lee SH, et al., A Better Coefficient of Determination for Genetic Profile Analysis. Genetic Epidemiology, 2012. 36(3):214-224
  • Observed R2: tells you how well your PRS predicts actual case/control status in your sample.
  • Liability R2: tells you how much of the underlying genetic liability is explained by your PRS, standardized to population-level risk.
    • This is critical if you want to compare PRS performance across studies
    • Liability R2 can be compared to SNP-based heritability estimates, e.g. as estimated from using GWAS summary statistics in LDSC software

Calculate Variance Explained on the Liability Scale

  • Function to convert observed R2 to R2 on liability scale
  • From Lee SH, et al., A Better Coefficient of Determination for Genetic Profile Analysis. Genetic Epidemiology, 2012. 36(3):214-224
# Variance explained by the PGS on the liability scale

# Convert observed R2 to the liability scale using the population prevalence of your disease/condition
# R code from Lee SH, et al., A Better Coefficient of Determination for Genetic Profile Analysis. Genetic Epidemiology, 2012. 36(3):214-224.
# Made into a function.
# lm = fitted linear model of a 0/1 outcome (fitted with lm(), not glm())
# K  = population prevalence
# P  = sample prevalence
# Returns: R2 on the liability scale (numeric scalar)

mapToLiabilityScale <- function(lm, K, P) {
  # Both prevalences must be proper probabilities for the formulas below
  stopifnot(K > 0, K < 1, P > 0, P < 1)
  thd <- qnorm(1 - K) # threshold on the normal distribution truncating proportion K (disease prevalence)
  zv <- dnorm(thd)    # normal density at the threshold
  mv <- zv / K        # mean liability for cases
  mv2 <- -mv * K / (1 - K) # mean liability for controls (kept for reference; unused below)
  
  y <- lm$model[[1]]  # the 0/1 outcome the model was fitted to
  ncase <- sum(y == 1)
  nt <- length(y)
  ncont <- nt - ncase
  R2O <- var(lm$fitted.values) / (ncase / nt * ncont / nt) # R2 on the observed scale
  
  theta <- mv * (P - K) / (1 - K) * (mv * (P - K) / (1 - K) - thd) # theta in equation 15 of the publication
  cv <- K * (1 - K) / zv^2 * K * (1 - K) / (P * (1 - P))           # C in equation 15 of the publication
  R2 <- R2O * cv / (1 + R2O * theta * cv)
  
  return(R2)
}
  • Fit linear models to calculate observed R2 for the model with covariates only and the model with covariates + PGS
  • Convert observed R2 to R2 on the liability scale using the above function
  • Calculate R2 (liability) attributable to PGS as R2 (liability) (covariates + PGS) - R2 (liability) (covariates only)
  • Run bootstrapping to calculate confidence intervals
# Calculate difference in R2 on liability scale of full model - covariates only model to identify variance attributable to PGS
# Bootstrap to get 90% CI
# Doing a one-sided test of significance (> 0) therefore use a one-sided 95% CI (equivalent to just the lower bound of a 90% CI)

set.seed(123)  # for reproducibility
n_boot <- 1000

# Preallocate the vector of bootstrap R2 differences
r2_diffs <- numeric(n_boot)

for (i in seq_len(n_boot)) {
  # Resample individuals with replacement
  boot_data <- Df[sample(seq_len(nrow(Df)), replace = TRUE), ]
  
  # Fit linear models (binary outcome on the observed scale) to the resample:
  # covariates only, and covariates + PGS
  lm_covar <- lm(Epilepsy ~ Sex + PC1 + PC2 + PC3 + PC4 + PC5 + PC6,
                 data = boot_data)
  lm_full <- lm(Epilepsy ~ SCORESUM + Sex + PC1 + PC2 + PC3 + PC4 + PC5 + PC6,
                data = boot_data)
  
  # Sample prevalence within this bootstrap sample
  ncase <- sum(boot_data$Epilepsy == 1)
  P_boot <- ncase / nrow(boot_data)
  
  # Liability-scale R2 (K = lifetime population prevalence, here 0.012 = 1.2%)
  r2_covar <- mapToLiabilityScale(lm_covar, K = 0.012, P = P_boot)
  r2_full <- mapToLiabilityScale(lm_full, K = 0.012, P = P_boot)
  
  r2_diffs[i] <- r2_full - r2_covar
}

save(r2_diffs, file = "Results/r2_liability_attributable_PGS_bootstrapping.Rdata")

Results Table

# Doing a one-sided test of significance (> 0) therefore use a one-sided 95% CI (equivalent to just the lower bound of a 90% CI)
# NOTE(review): the bootstrap standard error of an estimate is the SD of the
# bootstrap distribution itself; dividing by sqrt(n_boot) gives the Monte Carlo
# error of the bootstrap mean and grossly overstates the Z statistic.
tibble(
  r2l_mean = mean(r2_diffs),
  r2l_SE = sd(r2_diffs),
  r2l_Z = r2l_mean / r2l_SE,
  r2l_p = pnorm(r2l_Z, mean = 0, sd = 1, lower.tail = FALSE),
  r2l_90perc_CI_lower = quantile(r2_diffs, 0.05),
  r2l_90perc_CI_upper = quantile(r2_diffs, 0.95)) %>% 
  kable(caption = "Variance explained in epilepsy risk on the liability scale attributable to epilepsy PGS and one-sided test of significance if R2 (liability) attributable to PGS is > 0")
Variance explained in epilepsy risk on the liability scale attributable to epilepsy PGS and one-sided test of significance if R2 (liability) attributable to PGS is > 0
r2l_mean r2l_SE r2l_Z r2l_p r2l_90perc_CI_lower r2l_90perc_CI_upper
0.0511901 0.0006007 85.21772 0 0.0247302 0.0859619
  • R2 (liability) = 0.05, which means 5% of the variance in epilepsy risk (on the liability scale) is attributable to the epilepsy polygenic score
  • This value can be compared to SNP-based heritability (on liability scale), as both reflect the proportion of variance in liability explained by common SNPs

Odds Ratio by Decile of PGS

  • Cut the distribution of PGS into deciles, with each decile containing cases and controls
  • Calculate the odds ratio of each PGS decile compared to the first (lowest) PGS decile or the middle (5th and 6th) deciles
  • This is a practical and interpretable way to visualise that there could be utility in using high vs low PGS
  • But this method also doesn’t take into account the proportion of cases and controls in your data - it will look much more impressive if you have a data set with 50% cases and 50% controls, compared to a population sample.

Create PGS Decile Data

# Create deciles of PGS and set first decile as the reference/base group
Df_lowest <- Df %>%
  mutate(
    # ntile() splits the (standardised) PGS into 10 equal-sized groups
    PGS_decile = ntile(scale(SCORESUM), 10),
    PGS_decile = factor(PGS_decile, levels = 1:10)
  )

# Create deciles of PGS and set middle deciles as the reference/base group
Df_mid <- Df %>%
  mutate(
    PGS_decile = ntile(scale(SCORESUM), 10),
    # Merge the two middle deciles into one category; compare against the
    # integer values directly rather than relying on implicit coercion
    PGS_decile = case_when(
      PGS_decile %in% c(5, 6) ~ "5_6",
      TRUE ~ as.character(PGS_decile)
    ),
    # Listing "5_6" first makes it the reference level in the regression
    PGS_decile = factor(PGS_decile, levels = c('5_6','1','2','3','4','7','8','9','10'))
  )

Fit Model

Check Model

Reference: Lowest decile

# Diagnostics for the decile model with the lowest decile as reference
# (decile_model_lowest is fitted in an earlier, non-visible chunk)
performance::check_model(decile_model_lowest)

# DHARMa simulation-based scaled residuals — appropriate for GLMs
simulateResiduals(decile_model_lowest, plot=TRUE)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.4808425 0.4006835 0.9144018 0.9049724 0.9392691 0.6235637 0.6623095 0.4688741 0.5045213 0.638836 0.9609655 0.7475587 0.3199623 0.3425676 0.4193943 0.8649877 0.8471555 0.8116823 0.497919 0.9103658 ...

Reference: 5th and 6th deciles

# Diagnostics for the decile model with the merged middle deciles as reference
# (decile_model_mid is fitted in an earlier, non-visible chunk)
performance::check_model(decile_model_mid)

# DHARMa simulation-based scaled residuals — appropriate for GLMs
simulateResiduals(decile_model_mid, plot=TRUE)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.4808425 0.4006835 0.9144018 0.9040315 0.9392691 0.6235637 0.6623095 0.4688741 0.5045213 0.638836 0.9609655 0.7159717 0.3199623 0.3425676 0.4193943 0.8649877 0.8471555 0.8116823 0.5231941 0.9085732 ...

Results

Reference: Lowest decile

Table

  • Odds ratio for each PGS decile compared to the first decile
  • Adjust p-value for nine tests, using Benjamini-Hochberg method
# Odds ratio per PGS decile vs the lowest decile; BH-adjust across the 9 contrasts
decile_model_lowest %>%
  tidy(exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "PGS_decile")) %>%
  mutate(p.value_adjusted = p.adjust(p.value, method = "BH", n = 9)) %>%
  kable(caption = "Results from logistic regression with decile of PGS as predictor and lowest decile as the reference (Back-transformation conducted so estimate = odds ratio. P-value adjusted for 9 comparisons with Benjamini-Hochberg method)")
Results from logistic regression with decile of PGS as predictor and lowest decile as the reference (Back-transformation conducted so estimate = odds ratio. P-value adjusted for 9 comparisons with Benjamini-Hochberg method)
term estimate std.error statistic p.value conf.low conf.high p.value_adjusted
PGS_decile2 2.128884 0.7436129 1.016118 0.3095734 0.5212012 10.67241 0.3095734
PGS_decile3 4.183359 0.6993374 2.046387 0.0407183 1.1683779 19.82282 0.0732930
PGS_decile4 2.412578 0.7444925 1.182948 0.2368297 0.5907471 12.11951 0.2664334
PGS_decile5 2.556933 0.7286031 1.288504 0.1975704 0.6552924 12.57789 0.2540191
PGS_decile6 3.934688 0.6991824 1.959191 0.0500905 1.0988201 18.63585 0.0751357
PGS_decile7 4.434315 0.6948789 2.143357 0.0320845 1.2547005 20.89110 0.0721901
PGS_decile8 7.801298 0.6804313 3.019100 0.0025353 2.3036791 36.09150 0.0114087
PGS_decile9 5.646489 0.6851648 2.526449 0.0115222 1.6438500 26.27289 0.0345666
PGS_decile10 12.551283 0.6748478 3.748731 0.0001777 3.7770468 57.71648 0.0015996

Graph

# Odds ratios per decile vs the lowest decile, plus a unit row so the
# reference decile itself appears on the plot.
# (Trailing comma removed from the tidy() call — it created an empty argument.)
or_table <- tidy(decile_model_lowest, exponentiate = TRUE, conf.int = TRUE, conf.level = 0.95) %>%
  filter(str_detect(term, "PGS_decile")) %>%
  add_row(
    term = "PGS_decile1",
    estimate = 1,
    conf.low = 1,
    conf.high = 1,
    p.value = NA) %>% 
  mutate(
    decile = str_remove(term, "PGS_decile"),
    decile = factor(decile, levels = as.character(1:10)),
    # 9 decile contrasts against the reference, so n = 9
    p.value_adjusted = p.adjust(p.value, method = "BH", n = 9),
    sig_label = ifelse(p.value_adjusted < 0.05, "*", "")
  ) 

ggplot(or_table, aes(x = decile, y = estimate)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray") +
  geom_point() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  scale_y_continuous("Odds Ratio") +
  scale_x_discrete("PGS Decile") +
  geom_text(aes(label = sig_label, y = conf.high + 1), size = 6) +
  theme_classic() +
  theme(
    text = element_text(family = "Calibri"),
    axis.text = element_text(size = 10),
    axis.title.x = element_text(size = 12, colour = "black", margin = margin(10, 0, 0, 0)),
    axis.title.y = element_text(size = 12, colour = "black", margin = margin(0, 10, 0, 0)),
    legend.title = element_blank(),
    legend.text = element_text(size = 10, colour = "black"),
    legend.position = "right"
  )

Reference: 5th and 6th decile

Table

  • Odds ratio for each PGS decile compared to the middle (5th and 6th) deciles
  • Adjust p-values for the eight tests, using the Benjamini-Hochberg method
# Odds ratio per PGS decile vs the merged middle (5th & 6th) reference.
# This model has 8 decile contrasts, so the BH adjustment uses n = 8
# (caption corrected to match).
tidy(decile_model_mid, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "PGS_decile")) %>%
  mutate(p.value_adjusted = p.adjust(p.value, method = "BH", n = 8)) %>% 
  kable(caption = "Results from logistic regression with decile of PGS as predictor and middle deciles (5th and 6th) as the reference (Back-transformation conducted so estimate = odds ratio. P-value adjusted for 8 comparisons with Benjamini-Hochberg method)")
Results from logistic regression with decile of PGS as predictor and middle deciles (5th and 6th) as the reference (Back-transformation conducted so estimate = odds ratio. P-value adjusted for 9 comparisons with Benjamini-Hochberg method)
term estimate std.error statistic p.value conf.low conf.high p.value_adjusted
PGS_decile1 0.3103084 0.6577260 -1.7791427 0.0752164 0.0693238 0.9966392 0.2005770
PGS_decile2 0.6602868 0.5188219 -0.8000453 0.4236845 0.2216661 1.7482806 0.5743983
PGS_decile3 1.2986998 0.4502903 0.5804336 0.5616222 0.5231437 3.1045951 0.5743983
PGS_decile4 0.7483146 0.5162735 -0.5615858 0.5743983 0.2530874 1.9749678 0.5743983
PGS_decile7 1.3746537 0.4426014 0.7189354 0.4721807 0.5650837 3.2476429 0.5743983
PGS_decile8 2.4158366 0.4204529 2.0978466 0.0359187 1.0550577 5.5355786 0.1436748
PGS_decile9 1.7473400 0.4273181 1.3060402 0.1915389 0.7486755 4.0388934 0.3830778
PGS_decile10 3.8839014 0.4097572 3.3113274 0.0009285 1.7515447 8.7914692 0.0074284

Graph

# Odds ratios per decile vs the merged middle reference, plus a unit row so the
# reference appears on the plot.
# Fixes: trailing comma removed from the tidy() call; BH adjustment uses n = 8
# because this model has only 8 decile contrasts (5 & 6 form the reference).
or_table <- tidy(decile_model_mid, exponentiate = TRUE, conf.int = TRUE, conf.level = 0.95) %>%
  filter(str_detect(term, "PGS_decile")) %>%
  add_row(
    term = "PGS_decile5",
    estimate = 1,
    conf.low = 1,
    conf.high = 1,
    p.value = NA) %>% 
  mutate(
    decile = str_remove(term, "PGS_decile"),
    decile = factor(decile, levels = as.character(1:10)),
    p.value_adjusted = p.adjust(p.value, method = "BH", n = 8),
    sig_label = ifelse(p.value_adjusted < 0.05, "*", "")
  ) 

ggplot(or_table, aes(x = decile, y = estimate)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray") +
  geom_point() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  scale_y_continuous("Odds Ratio") +
  scale_x_discrete("PGS Decile",
                   labels = c('1', '2', '3', '4', '5 & 6', '7', '8', '9', '10')) +
  geom_text(aes(label = sig_label, y = conf.high + 1), size = 6) +
  theme_classic() +
  theme(
    text = element_text(family = "Calibri"),
    axis.text = element_text(size = 10),
    axis.title.x = element_text(size = 12, colour = "black", margin = margin(10, 0, 0, 0)),
    axis.title.y = element_text(size = 12, colour = "black", margin = margin(0, 10, 0, 0)),
    legend.title = element_blank(),
    legend.text = element_text(size = 10, colour = "black"),
    legend.position = "right"
  )

Area under the Curve (AUC)

  • AUC from Receiver Operating Characteristic (ROC) analysis
  • AUC = Probability that a randomly selected case has a higher test score than a randomly selected control
  • Range 0.5 to 1
    • 0.5 = no discrimination of cases from controls
    • 1 = perfect discrimination
  • The AUC is a measure of how well the PGS predicts a binary outcome (case/control status)
  • The AUC is independent of the proportion of cases and controls in the sample
  • But, the AUC is constrained by how well your PGS reflects the underlying genotype-to-phenotype relationship
    • If your PRS does not accurately capture the genetic architecture, then AUC will be low even if there is true heritability
    • The PRS is a proxy, it doesn’t measure true genetic risk perfectly
    • So AUC is ultimately limited by how well the PRS captures phenotype-genotype associations
  • The maximum AUC achievable depends on the heritability of the disease
    • The theoretical maximum AUC a PRS can achieve is limited by the heritability of the trait.
    • For traits with low SNP heritability (h2), even a perfect PRS (trained on infinite data) can’t fully separate cases and controls.
  • So the AUC has a problem with genetic interpretation
    • A high or low AUC doesn’t directly tell you about genetic architecture
    • You can’t assume that a low AUC means genetics don’t matter. Maybe your PGS is underpowered or poorly constructed, or maybe the trait is polygenic with small effects spread over many SNPs, limiting predictive ability.

Method 1: PGS Only

  • ROC analysis only using the PGS as a predictor

ROC analysis

# Logistic regression with the standardised PGS as the only predictor
model_PGS <- glm(Epilepsy ~ scale(SCORESUM), 
                 family = binomial(link = 'logit'), 
                 data = Df)

# ROC curve: discrimination of cases vs controls by the fitted probabilities
roc_model_PGS <- roc(Df$Epilepsy, model_PGS$fitted.values)
  
save(roc_model_PGS, file = "Results/roc_model_PGS.Rdata")

Results

Table

# Doing a one-sided test of significance (> 0.5) therefore use a one-sided 95% CI (equivalent to just the lower bound of a 90% CI)
tibble(
  AUC = as.numeric(auc(roc_model_PGS)),
  AUC_SE = sqrt(var(roc_model_PGS)),  # var() here is pROC's variance of the AUC
  AUC_Z = (AUC - 0.5) / AUC_SE,       # one-sided test against the no-discrimination value 0.5
  AUC_p = pnorm(AUC_Z, mean = 0, sd = 1, lower.tail = FALSE),
  AUC_90perc_CI_lower = as.numeric(ci(roc_model_PGS, conf.level = 0.90))[1]) %>% 
  kable(caption = "AUC results and one-sided test of significance if AUC is > 0.5")
AUC results and one-sided test of significance if AUC is > 0.5
AUC AUC_SE AUC_Z AUC_p AUC_90perc_CI_lower
0.6564001 0.0303257 5.157345 1e-07 0.6065188

Graph

# ROC curve for the PGS-only model, with the AUC printed on the plot
plot(roc_model_PGS,
     print.auc = TRUE,
     main = "ROC curve for PGS of epilepsy")

Method 2: PGS + Covariates

  • Find AUC attributable to PGS
  • AUC(covariates plus PGS) - AUC(covariates only)

ROC analysis

# Use the fitted probabilities from the logistic regression models to run ROC
# analysis for covariates only and covariates + PGS
roc_model_covar <- roc(Df$Epilepsy, logistic_regression_covar$fitted.values)
roc_model_full <- roc(Df$Epilepsy, logistic_regression_PGS$fitted.values)

save(roc_model_covar, roc_model_full, file = "Results/roc_models_covar_full.Rdata")

Results

Table

# DeLong's test for two correlated ROC curves
sig_test <- roc.test(roc_model_full, roc_model_covar)

# AUC_change is the gain from adding the PGS, i.e. full model minus
# covariates-only (the original covar - full gave a negative "change"
# inconsistent with the positive CI reported by roc.test)
tibble(
    AUC_covariates_only = as.numeric(auc(roc_model_covar)),
    AUC_covariates_plus_PGS = as.numeric(auc(roc_model_full)),
    AUC_change = AUC_covariates_plus_PGS - AUC_covariates_only,
    AUC_change_Z = sig_test[["statistic"]][["Z"]],
    AUC_change_p = sig_test[["p.value"]],
    AUC_change_CI_lower = sig_test$conf.int[1],
    AUC_change_CI_upper = sig_test$conf.int[2]) %>% 
    kable(caption = "AUC results and DeLong's test for two correlated ROC curves (does adding PGS to analysis significantly increase AUC?)")
AUC results and DeLong’s test for two correlated ROC curves (does adding PGS to analysis significanlty increase AUC?)
AUC_covariates_only AUC_covariates_plus_PGS AUC_change AUC_change_Z AUC_change_p AUC_change_CI_lower AUC_change_CI_upper
0.5949923 0.6779254 -0.0829331 2.706189 0.006806 0.0228686 0.1429975

Graph

# Pull the plotting data that ggroc() prepares for each ROC curve, tag each
# with its source model, and stack them so both curves share one ggplot.
roc_covar_df <- ggroc(roc_model_covar)[["data"]] %>%
  mutate(model = "covar")

roc_full_df <- ggroc(roc_model_full)[["data"]] %>%
  mutate(model = "full")

roc_combined_df <- bind_rows(roc_covar_df, roc_full_df)

# Rounded AUCs for the in-plot annotations
auc_covar <- round(auc(roc_model_covar), 3)
auc_full <- round(auc(roc_model_full), 3)

ggplot(roc_combined_df, aes(x = specificity, y = sensitivity, colour = model, linetype = model)) +
  geom_line() +
  scale_x_reverse("Specificity") +
  scale_y_continuous("Sensitivity") +
  scale_linetype_manual(values = c("covar" = "solid", "full" = "dashed")) +
  scale_colour_manual(values = c("covar" = "#160F3BFF", "full" = "#F4685CFF")) +
  annotate("text",
           x = max(roc_combined_df$specificity, na.rm = TRUE) - 0.1,
           y = min(roc_combined_df$sensitivity, na.rm = TRUE) + 0.05, hjust = 0,
           label = paste("AUC (Covariates only) =", auc_covar), color = "#160F3BFF") +
  annotate("text",
           x = max(roc_combined_df$specificity, na.rm = TRUE) - 0.1,
           y = min(roc_combined_df$sensitivity, na.rm = TRUE) + 0.05 + 0.05,
           hjust = 0, label = paste("AUC (Covariates + PGS) =", auc_full), color = "#F4685CFF") +
  theme_classic() +
  theme(text = element_text(family = "Calibri"),
        axis.text = element_text(size = 10),
        axis.title.x = element_text(size = 12, colour = "black", margin = margin(10, 0, 0, 0)),
        axis.title.y = element_text(size = 12, colour = "black", margin = margin(0, 10, 0, 0)),
        legend.title = element_blank(),
        legend.text = element_text(size = 10, colour = "black"),
        legend.position = "none")

  ## ----